In [4]:
import pandas as pd
pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv",encoding='cp1252')
Out[4]:
| City | Year | Sport | Discipline | Event | Athlete | Gender | Country_Code | Country | Event_gender | Medal | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Montreal | 1976.0 | Aquatics | Diving | 3m springboard | KÖHLER, Christa | Women | GDR | East Germany | W | Silver |
| 1 | Montreal | 1976.0 | Aquatics | Diving | 3m springboard | KOSENKOV, Aleksandr | Men | URS | Soviet Union | M | Bronze |
| 2 | Montreal | 1976.0 | Aquatics | Diving | 3m springboard | BOGGS, Philip George | Men | USA | United States | M | Gold |
| 3 | Montreal | 1976.0 | Aquatics | Diving | 3m springboard | CAGNOTTO, Giorgio Franco | Men | ITA | Italy | M | Silver |
| 4 | Montreal | 1976.0 | Aquatics | Diving | 10m platform | WILSON, Deborah Keplar | Women | USA | United States | W | Bronze |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15428 | Beijing | 2008.0 | Wrestling | Wrestling Gre-R | 66 - 74kg | GUENOT, Christophe | Men | FRA | France | M | Bronze |
| 15429 | Beijing | 2008.0 | Wrestling | Wrestling Gre-R | 66 - 74kg | KVIRKELIA, Manuchar | Men | GEO | Georgia | M | Gold |
| 15430 | Beijing | 2008.0 | Wrestling | Wrestling Gre-R | 55 - 60kg | RAHIMOV, Vitaliy | Men | AZE | Azerbaijan | M | Silver |
| 15431 | Beijing | 2008.0 | Wrestling | Wrestling Gre-R | 60 - 66kg | GUENOT, Steeve | Men | FRA | France | M | Gold |
| 15432 | Beijing | 2008.0 | Wrestling | Wrestling Gre-R | 96 - 120kg | LOPEZ, Mijain | Men | CUB | Cuba | M | Gold |
15433 rows × 11 columns
In [6]:
#Data Cleaning and Preparation:
#Cleanse the dataset by handling missing values, incorrect entries, and data inconsistencies.
#Transform data into appropriate formats for analysis, such as converting years and medal types.
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
# Check for missing values
print(df.isnull().sum())
# Handle missing values
df['City'].fillna('Unknown', inplace=True)
df['Year'].fillna(df['Year'].mode()[0], inplace=True)
df['Sport'].fillna('Unknown', inplace=True)
df['Discipline'].fillna('Unknown', inplace=True)
df['Event'].fillna('Unknown', inplace=True)
df['Athlete'].fillna('Unknown', inplace=True)
df['Gender'].fillna('Unknown', inplace=True)
df['Country_Code'].fillna('Unknown', inplace=True)
df['Country'].fillna('Unknown', inplace=True)
df['Event_gender'].fillna('Unknown', inplace=True)
df['Medal'].fillna('None', inplace=True)
# Convert 'Year' to numeric and handle any errors
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
# Capitalize string columns to ensure consistency
df['Sport'] = df['Sport'].str.capitalize()
df['Discipline'] = df['Discipline'].str.capitalize()
df['Event'] = df['Event'].str.capitalize()
df['Athlete'] = df['Athlete'].str.title()
df['Gender'] = df['Gender'].str.capitalize()
df['Country'] = df['Country'].str.capitalize()
df['Event_gender'] = df['Event_gender'].str.capitalize()
df['Medal'] = df['Medal'].str.capitalize()
# Check for unique values in categorical columns to ensure consistency
print(df['Gender'].unique())
print(df['Event_gender'].unique())
print(df['Medal'].unique())
# Standardize 'Gender' and 'Event_gender' entries
df['Gender'] = df['Gender'].replace({'M': 'Men', 'W': 'Women'})
df['Event_gender'] = df['Event_gender'].replace({'M': 'Men', 'W': 'Women'})
# Convert 'Year' to integer
df['Year'] = df['Year'].astype(int)
# Ensure 'Medal' column has consistent capitalization
df['Medal'] = df['Medal'].str.capitalize()
# Create a mapping for medal types if needed
medal_mapping = {'Gold': 3, 'Silver': 2, 'Bronze': 1, 'None': 0}
df['Medal_Value'] = df['Medal'].map(medal_mapping)
print(df.head())
City 117
Year 117
Sport 117
Discipline 117
Event 117
Athlete 117
Gender 117
Country_Code 117
Country 117
Event_gender 117
Medal 117
dtype: int64
['Women' 'Men' 'Unknown']
['W' 'M' 'X' 'Unknown']
['Silver' 'Bronze' 'Gold' 'None']
City Year Sport Discipline Event \
0 Montreal 1976 Aquatics Diving 3m springboard
1 Montreal 1976 Aquatics Diving 3m springboard
2 Montreal 1976 Aquatics Diving 3m springboard
3 Montreal 1976 Aquatics Diving 3m springboard
4 Montreal 1976 Aquatics Diving 10m platform
Athlete Gender Country_Code Country Event_gender \
0 Köhler, Christa Women GDR East germany Women
1 Kosenkov, Aleksandr Men URS Soviet union Men
2 Boggs, Philip George Men USA United states Men
3 Cagnotto, Giorgio Franco Men ITA Italy Men
4 Wilson, Deborah Keplar Women USA United states Women
Medal Medal_Value
0 Silver 2
1 Bronze 1
2 Gold 3
3 Silver 2
4 Bronze 1
In [7]:
#Exploratory Data Analysis (EDA):
#Conduct initial explorations to understand the distribution of medals, number of participating nations, and other key metrics.
#Visualize the overall trends in data, such as total medals won over the years.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Distribution of Medals
medal_counts = df['Medal'].value_counts()
print(medal_counts)
sns.countplot(data=df, x='Medal', order=medal_counts.index)
plt.title('Distribution of Medals')
plt.xlabel('Medal')
plt.ylabel('Count')
plt.show()
# Number of Participating Nations
nations = df['Country'].nunique()
print(f"Number of participating nations: {nations}")
nation_medal_counts = df['Country'].value_counts().head(20)
sns.barplot(y=nation_medal_counts.index, x=nation_medal_counts.values)
plt.title('Number of Medals Won by Country (Top 20)')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()
# Total Medals Won Over the Years
medals_by_year = df.groupby('Year')['Medal'].count()
print(medals_by_year)
plt.figure(figsize=(10, 6))
sns.lineplot(data=medals_by_year)
plt.title('Total Medals Won Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.show()
# Medals Won by Gender Over the Years
medals_by_gender_year = df.groupby(['Year', 'Gender'])['Medal'].count().unstack()
print(medals_by_gender_year)
medals_by_gender_year.plot(kind='bar', stacked=True, figsize=(12, 7))
plt.title('Medals Won by Gender Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Gender')
plt.show()
# Top Athletes by Medals Won
top_athletes = df['Athlete'].value_counts().head(10)
print(top_athletes)
sns.barplot(y=top_athletes.index, x=top_athletes.values)
plt.title('Top Athletes by Medals Won')
plt.xlabel('Number of Medals')
plt.ylabel('Athlete')
plt.show()
Medal Bronze 5258 Gold 5042 Silver 5016 Name: count, dtype: int64
Number of participating nations: 127
Year 1976.0 1305 1980.0 1387 1984.0 1459 1988.0 1546 1992.0 1705 1996.0 1859 2000.0 2015 2004.0 1998 2008.0 2042 Name: Medal, dtype: int64
C:\Users\LENOVO\anaconda\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\LENOVO\anaconda\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
Gender Men Women Year 1976.0 924 381 1980.0 958 429 1984.0 973 486 1988.0 1011 535 1992.0 1105 600 1996.0 1082 777 2000.0 1126 889 2004.0 1099 899 2008.0 1110 932
Athlete PHELPS, Michael 16 TORRES, Dara 12 THOMPSON, Jenny 12 FISCHER, Birgit 12 ANDRIANOV, Nikolay 12 NEMOV, Alexei 12 COUGHLIN, Natalie 11 BIONDI, Matthew 11 VAN ALMSICK, Franziska 10 LEWIS, Carl 10 Name: count, dtype: int64
In [8]:
#Country-Level Performance Analysis:
#Analyze the performance of countries over the years in terms of medals won.
#Identify the most successful countries in the Olympics and explore factors contributing to their success.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
# Calculate the total medals won by each country
total_medals_by_country = df['Country'].value_counts()
print(total_medals_by_country.head(10))
# Plot the total medals won by each country (Top 20)
plt.figure(figsize=(12, 8))
sns.barplot(y=total_medals_by_country.index[:20], x=total_medals_by_country.values[:20])
plt.title('Total Medals Won by Country (Top 20)')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()
# Identify the most successful countries
top_countries = total_medals_by_country.head(10).index
print(f"Top 10 Countries: {top_countries}")
# Filter data for the top countries
df_top_countries = df[df['Country'].isin(top_countries)]
# Group by Year and Country to get the total medals per year for each country
medals_by_country_year = df_top_countries.groupby(['Year', 'Country']).size().unstack()
# Plot the performance of top countries over time
plt.figure(figsize=(14, 8))
medals_by_country_year.plot(kind='line', marker='o', figsize=(14, 8))
plt.title('Performance of Top 10 Countries Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Country')
plt.show()
Country United States 1992 Soviet Union 1021 Australia 798 Germany 691 China 679 Russia 638 East Germany 626 Italy 486 Romania 482 United Kingdom 467 Name: count, dtype: int64
Top 10 Countries: Index(['United States', 'Soviet Union', 'Australia', 'Germany', 'China',
'Russia', 'East Germany', 'Italy', 'Romania', 'United Kingdom'],
dtype='object', name='Country')
<Figure size 1400x800 with 0 Axes>
In [9]:
#Athlete Performance Metrics:
#Evaluate the achievements of athletes, identifying standout performers and their medal hauls.
#Analyze the performance based on athlete demographics like gender and age (if available).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
# Calculate the total medals won by each athlete
total_medals_by_athlete = df['Athlete'].value_counts()
print(total_medals_by_athlete.head(10))
# Plot the total medals won by each athlete (Top 10)
plt.figure(figsize=(12, 8))
sns.barplot(y=total_medals_by_athlete.index[:10], x=total_medals_by_athlete.values[:10])
plt.title('Total Medals Won by Athlete (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Athlete')
plt.show()
# Identify top 10 standout performers
top_athletes = total_medals_by_athlete.head(10).index
print(f"Top 10 Athletes: {top_athletes}")
# Filter data for the top athletes
df_top_athletes = df[df['Athlete'].isin(top_athletes)]
print(df_top_athletes)
# Calculate the total medals won by gender
medals_by_gender = df['Gender'].value_counts()
print(medals_by_gender)
# Plot the total medals won by gender
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Gender', order=medals_by_gender.index)
plt.title('Total Medals Won by Gender')
plt.xlabel('Gender')
plt.ylabel('Number of Medals')
plt.show()
# Calculate the total medals won by gender over the years
medals_by_gender_year = df.groupby(['Year', 'Gender'])['Medal'].count().unstack()
print(medals_by_gender_year)
# Plot the total medals won by gender over the years
medals_by_gender_year.plot(kind='bar', stacked=True, figsize=(14, 8))
plt.title('Total Medals Won by Gender Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Gender')
plt.show()
# If Age data is available, analyze performance by age
# Example assuming Age column is available
# Calculate the total medals won by age
# medals_by_age = df['Age'].value_counts().sort_index()
# print(medals_by_age)
# Plot the total medals won by age
# plt.figure(figsize=(12, 8))
# sns.barplot(x=medals_by_age.index, y=medals_by_age.values)
# plt.title('Total Medals Won by Age')
# plt.xlabel('Age')
# plt.ylabel('Number of Medals')
# plt.show()
Athlete PHELPS, Michael 16 TORRES, Dara 12 THOMPSON, Jenny 12 FISCHER, Birgit 12 ANDRIANOV, Nikolay 12 NEMOV, Alexei 12 COUGHLIN, Natalie 11 BIONDI, Matthew 11 VAN ALMSICK, Franziska 10 LEWIS, Carl 10 Name: count, dtype: int64
Top 10 Athletes: Index(['PHELPS, Michael', 'TORRES, Dara', 'THOMPSON, Jenny', 'FISCHER, Birgit',
'ANDRIANOV, Nikolay', 'NEMOV, Alexei', 'COUGHLIN, Natalie',
'BIONDI, Matthew', 'VAN ALMSICK, Franziska', 'LEWIS, Carl'],
dtype='object', name='Athlete')
City Year Sport Discipline Event \
689 Montreal 1976.0 Gymnastics Artistic G. parallel bars
691 Montreal 1976.0 Gymnastics Artistic G. vault
729 Montreal 1976.0 Gymnastics Artistic G. team competition
732 Montreal 1976.0 Gymnastics Artistic G. floor exercises
738 Montreal 1976.0 Gymnastics Artistic G. rings
... ... ... ... ... ...
13567 Beijing 2008.0 Aquatics Swimming 4x100m freestyle relay
13568 Beijing 2008.0 Aquatics Swimming 4x200m freestyle relay
13589 Beijing 2008.0 Aquatics Swimming 200m individual medley
13592 Beijing 2008.0 Aquatics Swimming 100m freestyle
13597 Beijing 2008.0 Aquatics Swimming 200m individual medley
Athlete Gender Country_Code Country Event_gender \
689 ANDRIANOV, Nikolay Men URS Soviet Union M
691 ANDRIANOV, Nikolay Men URS Soviet Union M
729 ANDRIANOV, Nikolay Men URS Soviet Union M
732 ANDRIANOV, Nikolay Men URS Soviet Union M
738 ANDRIANOV, Nikolay Men URS Soviet Union M
... ... ... ... ... ...
13567 PHELPS, Michael Men USA United States M
13568 PHELPS, Michael Men USA United States M
13589 COUGHLIN, Natalie Women USA United States W
13592 COUGHLIN, Natalie Women USA United States W
13597 PHELPS, Michael Men USA United States M
Medal
689 Silver
691 Gold
729 Silver
732 Gold
738 Gold
... ...
13567 Gold
13568 Gold
13589 Bronze
13592 Bronze
13597 Gold
[118 rows x 11 columns]
Gender
Men 9388
Women 5928
Name: count, dtype: int64
Gender Men Women Year 1976.0 924 381 1980.0 958 429 1984.0 973 486 1988.0 1011 535 1992.0 1105 600 1996.0 1082 777 2000.0 1126 889 2004.0 1099 899 2008.0 1110 932
In [10]:
#Sport and Event-Specific Trends:
#Break down the analysis by individual sports and events to identify trends and dominant nations in each.
#Analyze changes in sports disciplines over the years, including the introduction or removal of sports from the Olympics.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
# Identify the most popular sports and events
medals_by_sport = df['Sport'].value_counts()
print(medals_by_sport.head(10))
plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_sport.index[:10], x=medals_by_sport.values[:10])
plt.title('Number of Medals Awarded in Each Sport (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Sport')
plt.show()
medals_by_event = df['Event'].value_counts()
print(medals_by_event.head(10))
plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_event.index[:10], x=medals_by_event.values[:10])
plt.title('Number of Medals Awarded in Each Event (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Event')
plt.show()
# Determine dominant nations in each sport
dominant_nations_by_sport = df.groupby(['Sport', 'Country']).size().reset_index(name='Medal Count')
dominant_nations_by_sport = dominant_nations_by_sport.sort_values(['Sport', 'Medal Count'], ascending=[True, False])
dominant_nations_by_sport = dominant_nations_by_sport.groupby('Sport').head(1)
print(dominant_nations_by_sport)
top_sports = medals_by_sport.index[:10]
dominant_nations_top_sports = dominant_nations_by_sport[dominant_nations_by_sport['Sport'].isin(top_sports)]
plt.figure(figsize=(12, 8))
sns.barplot(y='Sport', x='Medal Count', hue='Country', data=dominant_nations_top_sports)
plt.title('Dominant Nations in Each Sport (Top 10 Sports)')
plt.xlabel('Medal Count')
plt.ylabel('Sport')
plt.legend(title='Country')
plt.show()
# Analyze changes in sports disciplines over the years
medals_by_sport_year = df.groupby(['Year', 'Sport']).size().unstack().fillna(0)
print(medals_by_sport_year)
plt.figure(figsize=(14, 8))
medals_by_sport_year[top_sports].plot(kind='line', marker='o')
plt.title('Changes in the Number of Medals Awarded in Top Sports Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Sport')
plt.show()
Sport Aquatics 2210 Athletics 1523 Rowing 1377 Hockey 817 Gymnastics 783 Handball 780 Volleyball 695 Canoe / Kayak 672 Football 669 Basketball 646 Name: count, dtype: int64
Event hockey 817 handball 780 football 669 volleyball 647 basketball 646 eight with coxswain (8+) 486 team 459 water polo 455 baseball 335 team competition 329 Name: count, dtype: int64
Sport Country Medal Count 43 Aquatics United States 578 56 Archery Korea, South 52 142 Athletics United States 299 145 Badminton China 47 153 Baseball Cuba 111 172 Basketball United States 192 186 Boxing Cuba 56 251 Canoe / Kayak Germany 96 277 Cycling Australia 54 333 Equestrian United States 73 338 Fencing France 115 356 Football Brazil 109 393 Gymnastics Romania 116 412 Handball Korea, South 96 431 Hockey Netherlands 159 464 Judo Japan 57 495 Modern Pentathlon Hungary 15 531 Rowing Romania 156 579 Sailing United States 64 590 Shooting China 42 636 Softball Australia 60 640 Table Tennis China 61 667 Taekwondo Korea, South 12 702 Tennis United States 24 704 Triathlon Australia 4 715 Volleyball Brazil 102 741 Weightlifting China 43 824 Wrestling United States 57
Sport Aquatics Archery Athletics Badminton Baseball Basketball Boxing \ Year 1976.0 159.0 6.0 147.0 0.0 0.0 72.0 44.0 1980.0 159.0 6.0 150.0 0.0 0.0 72.0 44.0 1984.0 192.0 6.0 161.0 0.0 0.0 72.0 48.0 1988.0 202.0 24.0 163.0 0.0 0.0 72.0 48.0 1992.0 228.0 24.0 178.0 24.0 60.0 72.0 48.0 1996.0 262.0 24.0 180.0 24.0 60.0 72.0 48.0 2000.0 329.0 24.0 184.0 24.0 72.0 72.0 48.0 2004.0 332.0 24.0 183.0 24.0 71.0 70.0 44.0 2008.0 347.0 24.0 177.0 24.0 72.0 72.0 44.0 Sport Canoe / Kayak Cycling Equestrian ... Sailing Shooting Softball \ Year ... 1976.0 57.0 36.0 42.0 ... 36.0 21.0 0.0 1980.0 57.0 36.0 62.0 ... 36.0 21.0 0.0 1984.0 69.0 42.0 47.0 ... 39.0 33.0 0.0 1988.0 69.0 45.0 45.0 ... 45.0 39.0 0.0 1992.0 84.0 50.0 45.0 ... 51.0 39.0 0.0 1996.0 84.0 52.0 45.0 ... 48.0 45.0 45.0 2000.0 84.0 75.0 45.0 ... 54.0 51.0 45.0 2004.0 84.0 76.0 47.0 ... 54.0 51.0 45.0 2008.0 84.0 71.0 45.0 ... 54.0 45.0 45.0 Sport Table Tennis Taekwondo Tennis Triathlon Volleyball Weightlifting \ Year 1976.0 0.0 0.0 0.0 0.0 72.0 27.0 1980.0 0.0 0.0 0.0 0.0 72.0 30.0 1984.0 0.0 0.0 0.0 0.0 72.0 30.0 1988.0 18.0 0.0 24.0 0.0 72.0 30.0 1992.0 24.0 0.0 24.0 0.0 72.0 29.0 1996.0 18.0 0.0 18.0 0.0 84.0 30.0 2000.0 18.0 24.0 18.0 6.0 84.0 45.0 2004.0 18.0 24.0 16.0 6.0 83.0 45.0 2008.0 24.0 32.0 18.0 6.0 84.0 45.0 Sport Wrestling Year 1976.0 60.0 1980.0 60.0 1984.0 60.0 1988.0 60.0 1992.0 60.0 1996.0 60.0 2000.0 48.0 2004.0 54.0 2008.0 71.0 [9 rows x 28 columns]
<Figure size 1400x800 with 0 Axes>
In [11]:
#Impact of Geopolitical Events:
#Examine the impact of major geopolitical events, like wars or boycotts, on the participation and success rates in the Olympics.
#Study the return or debut of nations in the Olympic games and their performances.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
# Define the years of major geopolitical events
geopolitical_years = [1980, 1984]
# Analyze participation rates around these events
participation_by_year = df.groupby('Year')['Country'].nunique()
print(participation_by_year)
plt.figure(figsize=(12, 8))
participation_by_year.plot(kind='bar')
plt.axvline(x=participation_by_year.index.get_loc(1980), color='red', linestyle='--', label='1980 Moscow Boycott')
plt.axvline(x=participation_by_year.index.get_loc(1984), color='blue', linestyle='--', label='1984 LA Boycott')
plt.title('Number of Participating Countries in the Olympics (1976-2008)')
plt.xlabel('Year')
plt.ylabel('Number of Participating Countries')
plt.legend()
plt.show()
# Analyze success rates (medals won) around these events
medals_by_country_year = df.groupby(['Year', 'Country']).size().unstack().fillna(0)
print(medals_by_country_year)
top_countries = medals_by_country_year.sum().sort_values(ascending=False).head(5).index
plt.figure(figsize=(14, 8))
medals_by_country_year[top_countries].plot(kind='line', marker='o')
plt.axvline(x=participation_by_year.index.get_loc(1980), color='red', linestyle='--', label='1980 Moscow Boycott')
plt.axvline(x=participation_by_year.index.get_loc(1984), color='blue', linestyle='--', label='1984 LA Boycott')
plt.title('Total Medals Won by Top 5 Countries (1976-2008)')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Country')
plt.show()
# Study the return or debut of nations
debut_or_return = df.groupby('Country')['Year'].min().reset_index()
print(debut_or_return)
debut_1980 = df[df['Year'] == 1980]['Country'].unique()
debut_1984 = df[df['Year'] == 1984]['Country'].unique()
medals_by_debut_1980 = df[(df['Country'].isin(debut_1980)) & (df['Year'] == 1980)]['Country'].value_counts()
medals_by_debut_1984 = df[(df['Country'].isin(debut_1984)) & (df['Year'] == 1984)]['Country'].value_counts()
plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_debut_1980.index, x=medals_by_debut_1980.values)
plt.title('Medals Won by Debuting Nations in 1980')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()
plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_debut_1984.index, x=medals_by_debut_1984.values)
plt.title('Medals Won by Debuting Nations in 1984')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()
Year 1976.0 41 1980.0 36 1984.0 47 1988.0 52 1992.0 64 1996.0 79 2000.0 80 2004.0 74 2008.0 86 Name: Country, dtype: int64
Country Afghanistan Algeria Argentina Armenia Australia Austria \ Year 1976.0 0.0 0.0 0.0 0.0 24.0 1.0 1980.0 0.0 0.0 0.0 0.0 12.0 5.0 1984.0 0.0 2.0 0.0 0.0 50.0 3.0 1988.0 0.0 0.0 13.0 0.0 34.0 1.0 1992.0 0.0 2.0 2.0 0.0 57.0 6.0 1996.0 0.0 3.0 20.0 2.0 132.0 3.0 2000.0 0.0 5.0 20.0 1.0 183.0 4.0 2004.0 0.0 0.0 47.0 0.0 157.0 8.0 2008.0 1.0 2.0 51.0 6.0 149.0 3.0 Country Azerbaijan Bahamas Barbados Belarus ... United States Uruguay \ Year ... 1976.0 0.0 0.0 0.0 0.0 ... 155.0 0.0 1980.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1984.0 0.0 0.0 0.0 0.0 ... 333.0 0.0 1988.0 0.0 0.0 0.0 0.0 ... 193.0 0.0 1992.0 0.0 1.0 0.0 0.0 ... 224.0 0.0 1996.0 1.0 5.0 0.0 23.0 ... 260.0 0.0 2000.0 3.0 6.0 1.0 22.0 ... 248.0 1.0 2004.0 5.0 2.0 0.0 17.0 ... 264.0 0.0 2008.0 7.0 5.0 0.0 30.0 ... 315.0 0.0 Country Uzbekistan Venezuela Vietnam Virgin Islands* West Germany \ Year 1976.0 0.0 1.0 0.0 0.0 75.0 1980.0 0.0 1.0 0.0 0.0 0.0 1984.0 0.0 3.0 0.0 0.0 157.0 1988.0 0.0 0.0 0.0 1.0 113.0 1992.0 0.0 0.0 0.0 0.0 0.0 1996.0 2.0 0.0 0.0 0.0 0.0 2000.0 4.0 0.0 1.0 0.0 0.0 2004.0 5.0 2.0 0.0 0.0 0.0 2008.0 6.0 1.0 1.0 0.0 0.0 Country Yugoslavia Zambia Zimbabwe Year 1976.0 19.0 0.0 0.0 1980.0 57.0 0.0 16.0 1984.0 87.0 1.0 0.0 1988.0 63.0 0.0 0.0 1992.0 0.0 0.0 0.0 1996.0 26.0 1.0 0.0 2000.0 26.0 0.0 0.0 2004.0 0.0 0.0 3.0 2008.0 0.0 0.0 4.0 [9 rows x 127 columns]
<Figure size 1400x800 with 0 Axes>
Country Year 0 Afghanistan 2008.0 1 Algeria 1984.0 2 Argentina 1988.0 3 Armenia 1996.0 4 Australia 1976.0 .. ... ... 122 Virgin Islands* 1988.0 123 West Germany 1976.0 124 Yugoslavia 1976.0 125 Zambia 1984.0 126 Zimbabwe 1980.0 [127 rows x 2 columns]
In [27]:
#Predictive Analysis for Future Olympics:
#Use historical data to predict future medal counts and performances for upcoming Olympics.
#Develop models to forecast which countries might rise as new leaders in certain sports.
import pandas as pd
import numpy as np
# Convert the 'Year' column to an integer
#df['Year'] = df['Year'].astype(int)
# Check for missing values
df.isnull().sum()
# Fill missing values or drop rows/columns as appropriate
# In this case, we assume the dataset is clean and complete
Out[27]:
City 117 Year 117 Sport 117 Discipline 117 Event 117 Athlete 117 Gender 117 Country_Code 117 Country 117 Event_gender 117 Medal 117 dtype: int64
In [13]:
import matplotlib.pyplot as plt
import seaborn as sns
# Total medals won by country over the years
medals_by_country = df.groupby('Country').size().sort_values(ascending=False)
print(medals_by_country.head(10))
plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_country.index[:10], x=medals_by_country.values[:10])
plt.title('Total Medals Won by Country (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()
Country United States 1992 Soviet Union 1021 Australia 798 Germany 691 China 679 Russia 638 East Germany 626 Italy 486 Romania 482 United Kingdom 467 dtype: int64
In [31]:
# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Load and preprocess the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
#df['Year'] = df['Year'].astype(int)
medals_by_country_sport_year = df.groupby(['Year', 'Country', 'Sport']).size().reset_index(name='Medal Count')
# Time Series Analysis
sport_data = medals_by_country_sport_year[medals_by_country_sport_year['Sport'] == 'Athletics']
pivot_table = sport_data.pivot(index='Year', columns='Country', values='Medal Count').fillna(0)
# Ensure the country data has enough points for ARIMA
if 'USA' in pivot_table.columns:
country_data = pivot_table['USA']
if len(country_data) > 0:
plt.figure(figsize=(12, 6))
plt.plot(country_data.index, country_data.values)
plt.xlabel('Year')
plt.ylabel('Medal Count')
plt.title('Medal Count for USA in Athletics Over the Years')
plt.show()
try:
model = sm.tsa.ARIMA(country_data, order=(1, 1, 1))
results = model.fit()
forecast = results.forecast(steps=4) # Predict for the next 4 Olympic games (16 years)
print("Forecasted Medal Count for USA in Athletics for the next 4 Olympics:", forecast)
except Exception as e:
print("ARIMA model fitting/forecasting error:", e)
else:
print("Not enough data for ARIMA model.")
else:
print("USA data not available in the dataset.")
# Machine Learning Approach
features = medals_by_country_sport_year[['Year', 'Country', 'Sport']]
features = pd.get_dummies(features)
target = medals_by_country_sport_year['Medal Count']
# Ensure there are enough data points
if not features.empty:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
# Check for sufficient training data
if len(X_train) > 0 and len(y_train) > 0:
# Initialize and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Make predictions
predictions = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")
# Feature importance
importances = model.feature_importances_
feature_names = features.columns
sorted_importances = sorted(zip(importances, feature_names), reverse=True)
print("Top 10 Feature Importances:")
for importance, name in sorted_importances[:10]:
print(f"{name}: {importance:.4f}")
else:
print("Not enough data for training the Random Forest model.")
else:
print("Not enough data for Random Forest model.")
USA data not available in the dataset. Mean Squared Error: 15.310240155945417 Top 10 Feature Importances: Sport_Aquatics: 0.1760 Year: 0.1156 Country_United States: 0.0996 Sport_Athletics: 0.0589 Country_Soviet Union: 0.0560 Sport_Hockey: 0.0459 Sport_Handball: 0.0371 Sport_Rowing: 0.0363 Country_East Germany: 0.0353 Sport_Football: 0.0316
In [ ]:
#Data Visualization and Reporting:
#Create dynamic visualizations and dashboards that allow stakeholders to interact with the data and extract personalized insights.
#Prepare a detailed report summarizing the findings, methodologies, and implications for Olympic committees and sports analysts.
In [ ]:
In [ ]:
In [ ]:
In [21]:
pip install plotly dash pandas
Requirement already satisfied: plotly in c:\users\lenovo\anaconda\lib\site-packages (5.9.0)
Collecting dash
Downloading dash-2.17.1-py3-none-any.whl.metadata (10 kB)
Requirement already satisfied: pandas in c:\users\lenovo\anaconda\lib\site-packages (2.1.4)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\lenovo\anaconda\lib\site-packages (from plotly) (8.2.2)
Requirement already satisfied: Flask<3.1,>=1.0.4 in c:\users\lenovo\anaconda\lib\site-packages (from dash) (2.2.5)
Requirement already satisfied: Werkzeug<3.1 in c:\users\lenovo\anaconda\lib\site-packages (from dash) (2.2.3)
Collecting dash-html-components==2.0.0 (from dash)
Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Requirement already satisfied: importlib-metadata in c:\users\lenovo\anaconda\lib\site-packages (from dash) (7.0.1)
Requirement already satisfied: typing-extensions>=4.1.1 in c:\users\lenovo\anaconda\lib\site-packages (from dash) (4.9.0)
Requirement already satisfied: requests in c:\users\lenovo\anaconda\lib\site-packages (from dash) (2.31.0)
Collecting retrying (from dash)
Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Requirement already satisfied: nest-asyncio in c:\users\lenovo\anaconda\lib\site-packages (from dash) (1.6.0)
Requirement already satisfied: setuptools in c:\users\lenovo\anaconda\lib\site-packages (from dash) (68.2.2)
Requirement already satisfied: numpy<2,>=1.23.2 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: Jinja2>=3.0 in c:\users\lenovo\anaconda\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (3.1.3)
Requirement already satisfied: itsdangerous>=2.0 in c:\users\lenovo\anaconda\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (2.0.1)
Requirement already satisfied: click>=8.0 in c:\users\lenovo\anaconda\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (8.1.7)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\anaconda\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: MarkupSafe>=2.1.1 in c:\users\lenovo\anaconda\lib\site-packages (from Werkzeug<3.1->dash) (2.1.3)
Requirement already satisfied: zipp>=0.5 in c:\users\lenovo\anaconda\lib\site-packages (from importlib-metadata->dash) (3.17.0)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (2024.2.2)
Requirement already satisfied: colorama in c:\users\lenovo\anaconda\lib\site-packages (from click>=8.0->Flask<3.1,>=1.0.4->dash) (0.4.6)
Downloading dash-2.17.1-py3-none-any.whl (7.5 MB)
---------------------------------------- 0.0/7.5 MB ? eta -:--:--
---------------------------------------- 0.0/7.5 MB 1.4 MB/s eta 0:00:06
---------------------------------------- 0.0/7.5 MB 653.6 kB/s eta 0:00:12
---------------------------------------- 0.1/7.5 MB 550.5 kB/s eta 0:00:14
---------------------------------------- 0.1/7.5 MB 550.5 kB/s eta 0:00:14
---------------------------------------- 0.1/7.5 MB 459.5 kB/s eta 0:00:17
---------------------------------------- 0.1/7.5 MB 403.5 kB/s eta 0:00:19
--------------------------------------- 0.1/7.5 MB 425.1 kB/s eta 0:00:18
--------------------------------------- 0.1/7.5 MB 448.2 kB/s eta 0:00:17
--------------------------------------- 0.2/7.5 MB 393.8 kB/s eta 0:00:19
--------------------------------------- 0.2/7.5 MB 393.8 kB/s eta 0:00:19
--------------------------------------- 0.2/7.5 MB 338.5 kB/s eta 0:00:22
- -------------------------------------- 0.3/7.5 MB 505.4 kB/s eta 0:00:15
-- ------------------------------------- 0.4/7.5 MB 672.6 kB/s eta 0:00:11
-- ------------------------------------- 0.5/7.5 MB 720.0 kB/s eta 0:00:10
-- ------------------------------------- 0.5/7.5 MB 731.4 kB/s eta 0:00:10
--- ------------------------------------ 0.6/7.5 MB 784.4 kB/s eta 0:00:09
--- ------------------------------------ 0.7/7.5 MB 865.4 kB/s eta 0:00:08
---- ----------------------------------- 0.8/7.5 MB 927.5 kB/s eta 0:00:08
---- ----------------------------------- 0.8/7.5 MB 936.2 kB/s eta 0:00:08
---- ----------------------------------- 0.9/7.5 MB 921.8 kB/s eta 0:00:08
---- ----------------------------------- 0.9/7.5 MB 921.8 kB/s eta 0:00:08
---- ----------------------------------- 0.9/7.5 MB 901.1 kB/s eta 0:00:08
---- ----------------------------------- 0.9/7.5 MB 880.2 kB/s eta 0:00:08
---- ----------------------------------- 0.9/7.5 MB 880.2 kB/s eta 0:00:08
----- ---------------------------------- 1.0/7.5 MB 835.4 kB/s eta 0:00:08
----- ---------------------------------- 1.0/7.5 MB 856.2 kB/s eta 0:00:08
----- ---------------------------------- 1.0/7.5 MB 832.3 kB/s eta 0:00:08
----- ---------------------------------- 1.1/7.5 MB 852.0 kB/s eta 0:00:08
------ --------------------------------- 1.1/7.5 MB 876.4 kB/s eta 0:00:08
------ --------------------------------- 1.2/7.5 MB 902.2 kB/s eta 0:00:07
------ --------------------------------- 1.3/7.5 MB 920.9 kB/s eta 0:00:07
------- -------------------------------- 1.4/7.5 MB 979.4 kB/s eta 0:00:07
------- -------------------------------- 1.5/7.5 MB 972.9 kB/s eta 0:00:07
-------- ------------------------------- 1.6/7.5 MB 1.0 MB/s eta 0:00:06
-------- ------------------------------- 1.6/7.5 MB 1.0 MB/s eta 0:00:06
--------- ------------------------------ 1.8/7.5 MB 1.1 MB/s eta 0:00:06
--------- ------------------------------ 1.8/7.5 MB 1.1 MB/s eta 0:00:06
---------- ----------------------------- 1.9/7.5 MB 1.1 MB/s eta 0:00:05
---------- ----------------------------- 2.0/7.5 MB 1.2 MB/s eta 0:00:05
----------- ---------------------------- 2.2/7.5 MB 1.2 MB/s eta 0:00:05
----------- ---------------------------- 2.2/7.5 MB 1.2 MB/s eta 0:00:05
------------ --------------------------- 2.3/7.5 MB 1.2 MB/s eta 0:00:05
------------ --------------------------- 2.3/7.5 MB 1.2 MB/s eta 0:00:05
------------ --------------------------- 2.3/7.5 MB 1.2 MB/s eta 0:00:05
------------ --------------------------- 2.4/7.5 MB 1.2 MB/s eta 0:00:05
------------ --------------------------- 2.4/7.5 MB 1.2 MB/s eta 0:00:05
------------- -------------------------- 2.5/7.5 MB 1.2 MB/s eta 0:00:05
------------- -------------------------- 2.5/7.5 MB 1.2 MB/s eta 0:00:05
------------- -------------------------- 2.6/7.5 MB 1.2 MB/s eta 0:00:05
-------------- ------------------------- 2.7/7.5 MB 1.2 MB/s eta 0:00:05
-------------- ------------------------- 2.7/7.5 MB 1.2 MB/s eta 0:00:04
-------------- ------------------------- 2.7/7.5 MB 1.2 MB/s eta 0:00:04
--------------- ------------------------ 2.8/7.5 MB 1.2 MB/s eta 0:00:04
--------------- ------------------------ 2.9/7.5 MB 1.2 MB/s eta 0:00:04
--------------- ------------------------ 2.9/7.5 MB 1.2 MB/s eta 0:00:04
--------------- ------------------------ 3.0/7.5 MB 1.2 MB/s eta 0:00:04
---------------- ----------------------- 3.0/7.5 MB 1.2 MB/s eta 0:00:04
---------------- ----------------------- 3.1/7.5 MB 1.2 MB/s eta 0:00:04
---------------- ----------------------- 3.2/7.5 MB 1.2 MB/s eta 0:00:04
----------------- ---------------------- 3.2/7.5 MB 1.2 MB/s eta 0:00:04
----------------- ---------------------- 3.3/7.5 MB 1.2 MB/s eta 0:00:04
----------------- ---------------------- 3.4/7.5 MB 1.2 MB/s eta 0:00:04
------------------ --------------------- 3.4/7.5 MB 1.3 MB/s eta 0:00:04
------------------ --------------------- 3.5/7.5 MB 1.3 MB/s eta 0:00:04
------------------- -------------------- 3.6/7.5 MB 1.3 MB/s eta 0:00:04
-------------------- ------------------- 3.8/7.5 MB 1.3 MB/s eta 0:00:03
-------------------- ------------------- 3.9/7.5 MB 1.3 MB/s eta 0:00:03
-------------------- ------------------- 3.9/7.5 MB 1.3 MB/s eta 0:00:03
--------------------- ------------------ 4.0/7.5 MB 1.3 MB/s eta 0:00:03
--------------------- ------------------ 4.1/7.5 MB 1.3 MB/s eta 0:00:03
---------------------- ----------------- 4.2/7.5 MB 1.3 MB/s eta 0:00:03
---------------------- ----------------- 4.2/7.5 MB 1.3 MB/s eta 0:00:03
---------------------- ----------------- 4.3/7.5 MB 1.3 MB/s eta 0:00:03
----------------------- ---------------- 4.4/7.5 MB 1.4 MB/s eta 0:00:03
------------------------ --------------- 4.5/7.5 MB 1.4 MB/s eta 0:00:03
------------------------ --------------- 4.6/7.5 MB 1.4 MB/s eta 0:00:03
------------------------ --------------- 4.7/7.5 MB 1.4 MB/s eta 0:00:03
------------------------- -------------- 4.8/7.5 MB 1.4 MB/s eta 0:00:02
-------------------------- ------------- 4.9/7.5 MB 1.4 MB/s eta 0:00:02
-------------------------- ------------- 5.0/7.5 MB 1.4 MB/s eta 0:00:02
--------------------------- ------------ 5.1/7.5 MB 1.4 MB/s eta 0:00:02
--------------------------- ------------ 5.2/7.5 MB 1.4 MB/s eta 0:00:02
--------------------------- ------------ 5.2/7.5 MB 1.4 MB/s eta 0:00:02
---------------------------- ----------- 5.3/7.5 MB 1.4 MB/s eta 0:00:02
---------------------------- ----------- 5.3/7.5 MB 1.4 MB/s eta 0:00:02
----------------------------- ---------- 5.4/7.5 MB 1.4 MB/s eta 0:00:02
----------------------------- ---------- 5.5/7.5 MB 1.4 MB/s eta 0:00:02
----------------------------- ---------- 5.6/7.5 MB 1.4 MB/s eta 0:00:02
----------------------------- ---------- 5.6/7.5 MB 1.4 MB/s eta 0:00:02
------------------------------ --------- 5.7/7.5 MB 1.4 MB/s eta 0:00:02
------------------------------ --------- 5.7/7.5 MB 1.4 MB/s eta 0:00:02
------------------------------ --------- 5.7/7.5 MB 1.4 MB/s eta 0:00:02
------------------------------ --------- 5.8/7.5 MB 1.4 MB/s eta 0:00:02
------------------------------- -------- 5.8/7.5 MB 1.4 MB/s eta 0:00:02
------------------------------- -------- 5.8/7.5 MB 1.4 MB/s eta 0:00:02
------------------------------- -------- 5.9/7.5 MB 1.4 MB/s eta 0:00:02
-------------------------------- ------- 6.0/7.5 MB 1.4 MB/s eta 0:00:02
-------------------------------- ------- 6.1/7.5 MB 1.4 MB/s eta 0:00:02
-------------------------------- ------- 6.1/7.5 MB 1.4 MB/s eta 0:00:02
--------------------------------- ------ 6.2/7.5 MB 1.4 MB/s eta 0:00:01
--------------------------------- ------ 6.3/7.5 MB 1.4 MB/s eta 0:00:01
--------------------------------- ------ 6.3/7.5 MB 1.4 MB/s eta 0:00:01
--------------------------------- ------ 6.3/7.5 MB 1.4 MB/s eta 0:00:01
---------------------------------- ----- 6.4/7.5 MB 1.3 MB/s eta 0:00:01
---------------------------------- ----- 6.4/7.5 MB 1.3 MB/s eta 0:00:01
---------------------------------- ----- 6.5/7.5 MB 1.3 MB/s eta 0:00:01
---------------------------------- ----- 6.5/7.5 MB 1.3 MB/s eta 0:00:01
---------------------------------- ----- 6.5/7.5 MB 1.3 MB/s eta 0:00:01
----------------------------------- ---- 6.6/7.5 MB 1.3 MB/s eta 0:00:01
----------------------------------- ---- 6.7/7.5 MB 1.3 MB/s eta 0:00:01
------------------------------------ --- 6.8/7.5 MB 1.3 MB/s eta 0:00:01
------------------------------------ --- 6.9/7.5 MB 1.3 MB/s eta 0:00:01
------------------------------------ --- 6.9/7.5 MB 1.3 MB/s eta 0:00:01
------------------------------------- -- 7.0/7.5 MB 1.3 MB/s eta 0:00:01
------------------------------------- -- 7.1/7.5 MB 1.3 MB/s eta 0:00:01
-------------------------------------- - 7.2/7.5 MB 1.3 MB/s eta 0:00:01
-------------------------------------- - 7.2/7.5 MB 1.3 MB/s eta 0:00:01
-------------------------------------- - 7.2/7.5 MB 1.3 MB/s eta 0:00:01
--------------------------------------- 7.3/7.5 MB 1.3 MB/s eta 0:00:01
--------------------------------------- 7.4/7.5 MB 1.3 MB/s eta 0:00:01
--------------------------------------- 7.5/7.5 MB 1.3 MB/s eta 0:00:01
--------------------------------------- 7.5/7.5 MB 1.3 MB/s eta 0:00:01
---------------------------------------- 7.5/7.5 MB 1.3 MB/s eta 0:00:00
Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, retrying, dash
Successfully installed dash-2.17.1 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 retrying-1.3.4
Note: you may need to restart the kernel to use updated packages.
In [23]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
# Convert 'Year' column to an integer
#df['Year'] = df['Year'].astype(int)
# Aggregate data by country and year
medals_by_country_year = df.groupby(['Year', 'Country']).size().reset_index(name='Medal Count')
# Pivot the table for easier visualization
pivot_table = medals_by_country_year.pivot(index='Year', columns='Country', values='Medal Count').fillna(0)
pivot_table = pivot_table.reset_index()
In [24]:
# Line plot for total medals won by top 10 countries over the years
top_10_countries = df['Country'].value_counts().index[:10]
medals_by_year_top_10 = medals_by_country_year[medals_by_country_year['Country'].isin(top_10_countries)]
fig = px.line(medals_by_year_top_10, x='Year', y='Medal Count', color='Country', title='Total Medals Won by Top 10 Countries Over the Years')
fig.show()
# Bar plot for medals won by country in a specific year
year = 2008
medals_2008 = medals_by_country_year[medals_by_country_year['Year'] == year]
fig = px.bar(medals_2008, x='Country', y='Medal Count', title=f'Medals Won by Country in {year}')
fig.show()
# Heatmap for medals won by country and year
fig = px.imshow(pivot_table.set_index('Year'), title='Heatmap of Medals Won by Country and Year')
fig.show()
In [25]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
# Initialize the Dash app
app = dash.Dash(__name__)
# Layout of the app
app.layout = html.Div([
html.H1("Olympic Medals Dashboard"),
dcc.Dropdown(
id='country-dropdown',
options=[{'label': country, 'value': country} for country in df['Country'].unique()],
value='USA',
multi=True
),
dcc.Graph(id='medals-line-plot'),
dcc.Slider(
id='year-slider',
min=df['Year'].min(),
max=df['Year'].max(),
value=df['Year'].max(),
marks={str(year): str(year) for year in df['Year'].unique()},
step=None
),
dcc.Graph(id='medals-bar-plot')
])
# Callback for updating the line plot based on selected countries
@app.callback(
Output('medals-line-plot', 'figure'),
Input('country-dropdown', 'value')
)
def update_line_plot(selected_countries):
if not selected_countries:
return {}
medals_by_year_selected = medals_by_country_year[medals_by_country_year['Country'].isin(selected_countries)]
fig = px.line(medals_by_year_selected, x='Year', y='Medal Count', color='Country', title='Total Medals Won by Selected Countries Over the Years')
return fig
# Callback for updating the bar plot based on selected year
@app.callback(
Output('medals-bar-plot', 'figure'),
Input('year-slider', 'value')
)
def update_bar_plot(selected_year):
medals_selected_year = medals_by_country_year[medals_by_country_year['Year'] == selected_year]
fig = px.bar(medals_selected_year, x='Country', y='Medal Count', title=f'Medals Won by Country in {selected_year}')
return fig
# Run the app
if __name__ == '__main__':
app.run_server(debug=True)
In [ ]: